# Figure 1. Word frequency by question version

# Clear all
# NOTE(review): wiping the global environment and changing the working
# directory (below) are side effects on the user's session; they are kept so
# the script runs exactly as before.
rm(list = ls(all.names = TRUE))

# Load packages
library(foreign)
# library() stops right here if quanteda is not installed; require() would
# only return FALSE and let the script fail later at the first quanteda call.
library(quanteda)
quanteda_options(language_stemmer = "nor")

# Load data
# All input files are read relative to this project data directory.
path <- "P:/2017-pathways/new/2-data"
setwd(path)
# Semicolon-separated raw text data (read.csv2 also expects "," as decimal mark).
rawdata <- read.csv2("pathways-text-data.csv", header = TRUE, sep = ";",
                     stringsAsFactors = FALSE)
# NOTE(review): the code further down uses an object called `data` that is not
# created anywhere else in this script, so it is presumably restored by this
# load() call - confirm.
load("pathways-merged-intermediate.Rdata") # this file created by "pathways-stm.R", also see "readme-flow-master.txt"
# Two-column substitution list; the columns are renamed to orig/replace later.
subs <- read.csv(file = "2019-v1-substitution_list_no.csv", header = TRUE,
                 sep = ";", stringsAsFactors = FALSE)
dictionary <- read.csv("dictionary.csv", sep = ";")

# List the 3 wordings/treatments
explabels <- c("Oil/gas", "Energy", "Transition")

# Get rid of all variables except openanswer and treatment
# NOTE(review): the two columns are picked by position (5 and 6) but are
# addressed by name (`openanswer`, `treatment`) below - confirm the positions
# still match those names if the merged data file changes.
leandata <- data[, c(5, 6)]

# Keep only answers longer than two characters.
# nchar() returns NA for a missing string, and an NA in a logical row index
# would insert an all-NA row, so missing answers are excluded explicitly.
has_text <- !is.na(leandata$openanswer) & nchar(leandata$openanswer) > 2
leandata <- leandata[has_text, ]

# Replace the treatment level names with the readable labels.
# NOTE(review): assumes `treatment` is a factor whose levels are in the same
# order as `explabels` - confirm.
levels(leandata$treatment) <- explabels

# load and run the substitution list 
colnames(subs) <- c("orig", "replace")

# Keep an untouched copy of the answers, then apply every substitution rule,
# in list order, to the lower-cased answers.
leandata$openanswer.orig <- leandata$openanswer
oa <- tolower(leandata$openanswer)
# gsub() is vectorized over its input, so each rule is applied to all answers
# in one call; the rules are still applied one after another, so a later rule
# sees the output of the earlier ones. seq_len() makes an empty substitution
# list a no-op.
for (j in seq_len(nrow(subs))) {
  # Whole-word match: the rule text is wrapped in word boundaries and is
  # interpreted as a regular expression.
  pattern <- paste0("\\b",
                    subs$orig[j],
                    # "[a-zA-Z]*",
                    "\\b")
  oa <- gsub(pattern,
             subs$replace[j],
             oa,
             ignore.case = TRUE)  # bespoke spelling corrections, spacing, standardization
}
# guidance on regex: https://stackoverflow.com/questions/2790813/regular-expression-a-za-z-or-a-za-z
leandata$openanswer <- oa

# word frequency diagrams
diagsize <- 20  # number of most frequent words shown in each panel
# Extra words removed per treatment, one list entry per element of `explabels`
# (same order).
stopwordlist <- list(c("olje", "gass"),
                     "energi", 
                     c("omstill", "omstil"))
stopifnot(length(stopwordlist) == length(explabels))

# Figure 1: Word frequency by experimental treatment, and total
# ... by experimental treatment
# 2 x 2 layout: one panel per treatment, the fourth panel is the total.
par(mfrow = c(2, 2))
par(mar = c(4, 6, .2, .2))  # (4,5,.2,.2))
quanteda_options(language_stemmer = "nor")
for (i in seq_along(explabels)) {
  # Document-feature matrix of the answers given under treatment i, without
  # Norwegian stop words and without this treatment's extra stop words.
  pw.analyze.dfm <- dfm(leandata$openanswer[leandata$treatment == explabels[i]],
                        verbose = TRUE, tolower = TRUE,
                        remove_numbers = TRUE, remove_punct = TRUE,
                        remove_separators = TRUE, stem = TRUE,
                        remove = c(stopwords("norwegian"), stopwordlist[[i]]))
  # Total count per feature, sorted ascending so the most frequent words are
  # last (and therefore drawn at the top of the horizontal bar plot).
  wordfreq <- colSums(pw.analyze.dfm)
  wordfreq.sorted <- sort(wordfreq, decreasing = FALSE)
  # list top 30 for reference
  print(tail(wordfreq.sorted, 30))
  # Plot data for the `diagsize` most frequent words. The labels are the
  # features exactly as they appear in the dfm; no translation is applied.
  top_words <- tail(wordfreq.sorted, diagsize)
  wdf <- data.frame(freq = unname(top_words),
                    names = names(top_words),
                    stringsAsFactors = FALSE)
  barplot(height    = wdf$freq,
          names.arg = wdf$names,
          las = 2, xlab = paste("Word frequency, question: ", explabels[i]),
          xlim = c(0, 350),
          horiz = TRUE,
          cex.names = .7)
}
# ... 4th, total graph (all three treatments)
# The pooled answers come from all treatments, so the extra stop words of
# every treatment are removed. Do not index `stopwordlist` with a loop
# variable here: this code runs after the per-treatment loop has finished, so
# such an index would silently select the last treatment's words only.
pw.analyze.dfm <- dfm(leandata$openanswer,
                      verbose = TRUE, tolower = TRUE,
                      remove_numbers = TRUE, remove_punct = TRUE,
                      remove_separators = TRUE, stem = TRUE,
                      remove = c(stopwords("norwegian"), unlist(stopwordlist)))
# Total count per feature, sorted ascending so the most frequent words are
# last (and therefore drawn at the top of the horizontal bar plot).
wordfreq <- colSums(pw.analyze.dfm)
wordfreq.sorted <- sort(wordfreq, decreasing = FALSE)
# list top 10 for reference; print() so it is also shown when the script is
# run through source()
print(tail(wordfreq.sorted, 10))
# Plot it
# par(mar=c(4,5,.5,.5))
top_words <- tail(wordfreq.sorted, diagsize)
wdf <- data.frame(freq = unname(top_words),
                  names = names(top_words),
                  stringsAsFactors = FALSE)
# NOTE(review): translations are not merged in; the bar labels are the
# features exactly as they appear in the dfm.
barplot(height    = wdf$freq,
        names.arg = wdf$names,
        las = 2, xlab = "Word frequency: Total",
        xlim = c(0, 350),
        horiz = TRUE,
        cex.names = .8)
# End 4-panel graph (Figure 1)